IMPORT EVERYTHING
# Commands to install some of the libraries in-case if they are not installed
# Any other library that needs to be installed just use: !pip install <library name>
# !pip install seaborn
# !pip install missingno
# !pip install xgboost
# !pip install catboost
# !pip install regex
# !pip install sklearn
# !pip install pandas
# !pip install numpy
# !pip install imblearn
# pip install lightgbm
# !pip install --upgrade matplotlib
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np # linear algebra
import matplotlib.pyplot as plt #graphs and plots
import seaborn as sns #data visualizations
import csv # Some extra functionalities for csv files - reading it as a dictionary
from lightgbm import LGBMClassifier #sklearn is for machine learning and statistical modeling including classification, regression, clustering and dimensionality reduction
from sklearn.model_selection import train_test_split, cross_validate #break up dataset into train and test sets
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# importing python library for working with missing data
import missingno as msno
# To install missingno use: !pip install missingno
import re # This library is used to perform regex pattern matching
# import various functions from sklearn
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
from catboost import CatBoostClassifier
import xgboost as xgb
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, classification_report, make_scorer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
import plotly.offline as pyo
pyo.init_notebook_mode(connected=True)
Import additional items as needed...
from sklearn.model_selection import KFold,cross_val_score, RepeatedStratifiedKFold,StratifiedKFold
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import OneHotEncoder,StandardScaler,PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer,SimpleImputer
from sklearn.compose import make_column_transformer
from imblearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.impute import SimpleImputer
from sklearn.dummy import DummyClassifier
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score,\
precision_score, recall_score, roc_auc_score,\
ConfusionMatrixDisplay, classification_report, RocCurveDisplay, f1_score
from sklearn.linear_model import LinearRegression
import plotly
import plotly.express as px
import plotly.graph_objs as go
import plotly.offline as py
from plotly.offline import iplot
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import warnings
warnings.filterwarnings("ignore")
diabetic = pd.read_csv('D:/GitHub/HHA550_Analysis/finaldata/cleaned_diabetic_data_final_presentation1.csv')
diabetic.head()
| Unnamed: 0 | race | gender | age | admission_type_id | discharge_disposition_id | admission_source_id | time_in_hospital | num_lab_procedures | num_procedures | ... | citoglipton | insulin | glyburide-metformin | glipizide-metformin | glimepiride-pioglitazone | metformin-rosiglitazone | metformin-pioglitazone | change | diabetesMed | readmitted | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 3 | 0 | 1 | 1 | 1 | 7 | 3 | 59 | 0 | ... | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 |
| 1 | 2 | 1 | 0 | 2 | 1 | 1 | 7 | 2 | 11 | 5 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 2 | 3 | 3 | 1 | 3 | 1 | 1 | 7 | 2 | 44 | 1 | ... | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 |
| 3 | 4 | 3 | 1 | 4 | 1 | 1 | 7 | 1 | 51 | 0 | ... | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 |
| 4 | 5 | 3 | 1 | 5 | 2 | 1 | 2 | 3 | 31 | 6 | ... | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
5 rows × 44 columns
y = diabetic['readmitted']
print(f'Percentage of patient who readmitted within 30 days: % {round(y.value_counts(normalize=True)[1]*100,2)} --> ({y.value_counts()[1]} patient)\nPercentage of patient did not readmit within 30 days: % {round(y.value_counts(normalize=True)[0]*100,2)} --> ({y.value_counts()[1]} patient)')
Percentage of patient who readmitted within 30 days: % 11.23 --> (9486 patient) Percentage of patient did not readmit within 30 days: % 88.77 --> (9486 patient)
fig = px.histogram(diabetic, x="readmitted", title='Readmitted', width=400, height=400)
fig.show()
There is an imbalance in the data regarding patient readmission within 30 days.
Since there is an imbalance, some metrics such as accuracy give us misleading results.
msno.matrix(diabetic)
<Axes: >
No missing values.
Since the target data is skewed, the best metric for this binary classification problem would be Area Under the ROC Curve (AUC).
diabetic.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 84447 entries, 0 to 84446 Data columns (total 44 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 84447 non-null int64 1 race 84447 non-null int64 2 gender 84447 non-null int64 3 age 84447 non-null int64 4 admission_type_id 84447 non-null int64 5 discharge_disposition_id 84447 non-null int64 6 admission_source_id 84447 non-null int64 7 time_in_hospital 84447 non-null int64 8 num_lab_procedures 84447 non-null int64 9 num_procedures 84447 non-null int64 10 num_medications 84447 non-null int64 11 number_outpatient 84447 non-null int64 12 number_emergency 84447 non-null int64 13 number_inpatient 84447 non-null int64 14 diag_1 84447 non-null int64 15 number_diagnoses 84447 non-null int64 16 max_glu_serum 84447 non-null int64 17 A1Cresult 84447 non-null int64 18 metformin 84447 non-null int64 19 repaglinide 84447 non-null int64 20 nateglinide 84447 non-null int64 21 chlorpropamide 84447 non-null int64 22 glimepiride 84447 non-null int64 23 acetohexamide 84447 non-null int64 24 glipizide 84447 non-null int64 25 glyburide 84447 non-null int64 26 tolbutamide 84447 non-null int64 27 pioglitazone 84447 non-null int64 28 rosiglitazone 84447 non-null int64 29 acarbose 84447 non-null int64 30 miglitol 84447 non-null int64 31 troglitazone 84447 non-null int64 32 tolazamide 84447 non-null int64 33 examide 84447 non-null int64 34 citoglipton 84447 non-null int64 35 insulin 84447 non-null int64 36 glyburide-metformin 84447 non-null int64 37 glipizide-metformin 84447 non-null int64 38 glimepiride-pioglitazone 84447 non-null int64 39 metformin-rosiglitazone 84447 non-null int64 40 metformin-pioglitazone 84447 non-null int64 41 change 84447 non-null int64 42 diabetesMed 84447 non-null int64 43 readmitted 84447 non-null int64 dtypes: int64(44) memory usage: 28.3 MB
There are no missing values. Every column has 83254 non-null values.
Look at the data elements using diabetic_head()
Look at the dtype using diabetic.info()
diabetic.head()
| Unnamed: 0 | race | gender | age | admission_type_id | discharge_disposition_id | admission_source_id | time_in_hospital | num_lab_procedures | num_procedures | ... | citoglipton | insulin | glyburide-metformin | glipizide-metformin | glimepiride-pioglitazone | metformin-rosiglitazone | metformin-pioglitazone | change | diabetesMed | readmitted | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 3 | 0 | 1 | 1 | 1 | 7 | 3 | 59 | 0 | ... | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 |
| 1 | 2 | 1 | 0 | 2 | 1 | 1 | 7 | 2 | 11 | 5 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 2 | 3 | 3 | 1 | 3 | 1 | 1 | 7 | 2 | 44 | 1 | ... | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 |
| 3 | 4 | 3 | 1 | 4 | 1 | 1 | 7 | 1 | 51 | 0 | ... | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 |
| 4 | 5 | 3 | 1 | 5 | 2 | 1 | 2 | 3 | 31 | 6 | ... | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
5 rows × 44 columns
diabetic.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 84447 entries, 0 to 84446 Data columns (total 44 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 84447 non-null int64 1 race 84447 non-null int64 2 gender 84447 non-null int64 3 age 84447 non-null int64 4 admission_type_id 84447 non-null int64 5 discharge_disposition_id 84447 non-null int64 6 admission_source_id 84447 non-null int64 7 time_in_hospital 84447 non-null int64 8 num_lab_procedures 84447 non-null int64 9 num_procedures 84447 non-null int64 10 num_medications 84447 non-null int64 11 number_outpatient 84447 non-null int64 12 number_emergency 84447 non-null int64 13 number_inpatient 84447 non-null int64 14 diag_1 84447 non-null int64 15 number_diagnoses 84447 non-null int64 16 max_glu_serum 84447 non-null int64 17 A1Cresult 84447 non-null int64 18 metformin 84447 non-null int64 19 repaglinide 84447 non-null int64 20 nateglinide 84447 non-null int64 21 chlorpropamide 84447 non-null int64 22 glimepiride 84447 non-null int64 23 acetohexamide 84447 non-null int64 24 glipizide 84447 non-null int64 25 glyburide 84447 non-null int64 26 tolbutamide 84447 non-null int64 27 pioglitazone 84447 non-null int64 28 rosiglitazone 84447 non-null int64 29 acarbose 84447 non-null int64 30 miglitol 84447 non-null int64 31 troglitazone 84447 non-null int64 32 tolazamide 84447 non-null int64 33 examide 84447 non-null int64 34 citoglipton 84447 non-null int64 35 insulin 84447 non-null int64 36 glyburide-metformin 84447 non-null int64 37 glipizide-metformin 84447 non-null int64 38 glimepiride-pioglitazone 84447 non-null int64 39 metformin-rosiglitazone 84447 non-null int64 40 metformin-pioglitazone 84447 non-null int64 41 change 84447 non-null int64 42 diabetesMed 84447 non-null int64 43 readmitted 84447 non-null int64 dtypes: int64(44) memory usage: 28.3 MB
Seperate Categorical and Numerical elements
categorical = ['race', 'gender', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'diag_1',
'A1Cresult', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone',
'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton',
'insulin', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone',
'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted']
numerical = ['age', 'time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications',
'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses']
diabetic[numerical].describe()
| age | time_in_hospital | num_lab_procedures | num_procedures | num_medications | number_outpatient | number_emergency | number_inpatient | number_diagnoses | |
|---|---|---|---|---|---|---|---|---|---|
| count | 84447.000000 | 84447.000000 | 84447.000000 | 84447.000000 | 84447.000000 | 84447.000000 | 84447.000000 | 84447.000000 | 84447.000000 |
| mean | 6.084467 | 4.382737 | 43.656234 | 1.340107 | 15.997833 | 0.356354 | 0.208521 | 0.652386 | 7.518538 |
| std | 1.604887 | 2.977027 | 19.390216 | 1.708996 | 8.121821 | 1.257435 | 0.981372 | 1.285440 | 1.912526 |
| min | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| 25% | 5.000000 | 2.000000 | 33.000000 | 0.000000 | 10.000000 | 0.000000 | 0.000000 | 0.000000 | 6.000000 |
| 50% | 6.000000 | 4.000000 | 45.000000 | 1.000000 | 15.000000 | 0.000000 | 0.000000 | 0.000000 | 9.000000 |
| 75% | 7.000000 | 6.000000 | 57.000000 | 2.000000 | 20.000000 | 0.000000 | 0.000000 | 1.000000 | 9.000000 |
| max | 9.000000 | 14.000000 | 132.000000 | 6.000000 | 81.000000 | 42.000000 | 76.000000 | 21.000000 | 16.000000 |
negative = left skew
positive = right skew
0 = no skew
diabetic[numerical].skew()
age -0.637067 time_in_hospital 1.144088 num_lab_procedures -0.327120 num_procedures 1.317758 num_medications 1.329845 number_outpatient 8.878266 number_emergency 22.791751 number_inpatient 3.538992 number_diagnoses -0.967006 dtype: float64
diabetic[numerical].hist(figsize=(20,10));
print (f'{round(diabetic["gender"].value_counts(normalize=True)*100,2)}')
fig = px.histogram(diabetic, x="gender", title='Gender', width=500, height=500)
fig.show()
gender 0 54.03 1 45.97 Name: proportion, dtype: float64
print (f'{round(diabetic["race"].value_counts(normalize=True)*100,2)}')
fig = px.histogram(diabetic, x="race", title='race', width=600, height=600)
fig.show()
race 3 75.79 1 20.22 4 1.81 5 1.52 2 0.66 Name: proportion, dtype: float64
print (f'{round(diabetic["admission_type_id"].value_counts(normalize=True)*100,2)}')
fig = px.histogram(diabetic, x="admission_type_id", title='admission_type_id', width=750, height=750)
fig.show()
admission_type_id 1 59.32 3 20.69 2 19.95 7 0.02 4 0.01 Name: proportion, dtype: float64
print (f'{round(diabetic["A1Cresult"].value_counts(normalize=True)*100,2)}')
fig = px.histogram(diabetic, x="A1Cresult", title='A1Cresult', width=750, height=750)
fig.show()
A1Cresult 0 82.97 3 7.94 1 5.27 2 3.82 Name: proportion, dtype: float64
print (f'{round(diabetic["diabetesMed"].value_counts(normalize=True)*100,2)}')
fig = px.histogram(diabetic, x="diabetesMed", title='diabetesMed', width=750, height=750)
fig.show()
diabetesMed 1 77.54 0 22.46 Name: proportion, dtype: float64
print (f'{round(diabetic["insulin"].value_counts(normalize=True)*100,2)}')
fig = px.histogram(diabetic, x="insulin", title='insulin', width=750, height=750)
fig.show()
insulin 0 45.23 2 31.03 1 12.55 3 11.19 Name: proportion, dtype: float64
print (f'{round(diabetic["age"].value_counts(normalize=True)*100,2)}')
fig = px.histogram(diabetic, x="age", title='age', width=750, height=750)
fig.show()
age 7 25.10 6 22.31 5 17.09 8 16.94 4 9.44 3 3.75 9 2.75 2 1.70 1 0.74 0 0.17 Name: proportion, dtype: float64
print (f'A male has a probability of {round(diabetic[diabetic["gender"]==1]["readmitted"].mean()*100,2)} % to readmit within 30 days')
print()
print (f'A female has a probability of {round(diabetic[diabetic["gender"]==0]["readmitted"].mean()*100,2)} % to readmit within 30 days')
A male has a probability of 11.16 % to readmit within 30 days A female has a probability of 11.3 % to readmit within 30 days
fig = px.histogram(diabetic, x="gender", color="readmitted",width=600, height=600)
fig.show()
print (f'A Caucasian has a probability of {round(diabetic[diabetic["race"]==3]["readmitted"].mean()*100,2)} % of readmitting within 30 days')
print()
print (f'A African American has a probability of {round(diabetic[diabetic["race"]==1]["readmitted"].mean()*100,2)} % of readmitting within 30 days')
print()
print (f'A Asian has a probability of {round(diabetic[diabetic["race"]==2]["readmitted"].mean()*100,2)} % of readmitting within 30 days')
print()
print (f'A Hispanic has a probability of {round(diabetic[diabetic["race"]==4]["readmitted"].mean()*100,2)} % of readmitting within 30 days')
print()
print (f'A Other has a probability of {round(diabetic[diabetic["race"]==5]["readmitted"].mean()*100,2)} % of readmitting within 30 days')
print()
A Caucasian has a probability of 11.36 % of readmitting within 30 days A African American has a probability of 11.06 % of readmitting within 30 days A Asian has a probability of 10.38 % of readmitting within 30 days A Hispanic has a probability of 9.56 % of readmitting within 30 days A Other has a probability of 9.45 % of readmitting within 30 days
fig = px.histogram(diabetic, x="race", color="readmitted",width=600, height=600)
fig.show()
print (f'A patient admitted for "Emergency" has a probability of {round(diabetic[diabetic["admission_type_id"]==1]["readmitted"].mean()*100,2)} % of readmitting within 30 days')
print()
print (f'A patient admitted for "Urgent" has a probability of {round(diabetic[diabetic["admission_type_id"]==2]["readmitted"].mean()*100,2)} % of readmitting within 30 days')
print()
print (f'A patient admitted for "Elective" has a probability of {round(diabetic[diabetic["admission_type_id"]==3]["readmitted"].mean()*100,2)} % of readmitting within 30 days')
print()
print (f'A patient admitted for "Newborn" has a probability of {round(diabetic[diabetic["admission_type_id"]==4]["readmitted"].mean()*100,2)} % of readmitting within 30 days')
print()
print (f'A patient admitted for "Trauma Center" has a probability of {round(diabetic[diabetic["admission_type_id"]==7]["readmitted"].mean()*100,2)} % of readmitting within 30 days')
print()
A patient admitted for "Emergency" has a probability of 11.54 % of readmitting within 30 days A patient admitted for "Urgent" has a probability of 11.26 % of readmitting within 30 days A patient admitted for "Elective" has a probability of 10.35 % of readmitting within 30 days A patient admitted for "Newborn" has a probability of 10.0 % of readmitting within 30 days A patient admitted for "Trauma Center" has a probability of 0.0 % of readmitting within 30 days
fig = px.histogram(diabetic, x="admission_type_id", color="readmitted",width=600, height=600)
fig.show()
print (f'A patient with an A1Cresult of "None" has a probability of {round(diabetic[diabetic["A1Cresult"]==0]["readmitted"].mean()*100,2)} % of readmitting within 30 days')
print()
print (f'A patient with an A1Cresult of "Norm" has a probability of {round(diabetic[diabetic["A1Cresult"]==1]["readmitted"].mean()*100,2)} % of readmitting within 30 days')
print()
print (f'A patient with an A1Cresult of ">7" has a probability of {round(diabetic[diabetic["A1Cresult"]==2]["readmitted"].mean()*100,2)} % of readmitting within 30 days')
print()
print (f'A patient with an A1Cresult of ">8" has a probability of {round(diabetic[diabetic["A1Cresult"]==3]["readmitted"].mean()*100,2)} % of readmitting within 30 days')
print()
A patient with an A1Cresult of "None" has a probability of 11.53 % of readmitting within 30 days A patient with an A1Cresult of "Norm" has a probability of 9.55 % of readmitting within 30 days A patient with an A1Cresult of ">7" has a probability of 10.04 % of readmitting within 30 days A patient with an A1Cresult of ">8" has a probability of 9.84 % of readmitting within 30 days
fig = px.histogram(diabetic, x="A1Cresult", color="readmitted",width=600, height=600)
fig.show()
from sklearn.metrics import mutual_info_score
def cat_mut_inf(series):
return mutual_info_score(series, diabetic['readmitted'])
diabetic_cat = diabetic[categorical].apply(cat_mut_inf)
diabetic_cat = diabetic_cat.sort_values(ascending=False).to_frame(name='mutual_info_score')
diabetic_cat
| mutual_info_score | |
|---|---|
| readmitted | 3.513609e-01 |
| discharge_disposition_id | 8.159116e-03 |
| insulin | 1.011328e-03 |
| diag_1 | 1.000959e-03 |
| diabetesMed | 3.780887e-04 |
| change | 2.860761e-04 |
| metformin | 2.782392e-04 |
| admission_source_id | 2.582642e-04 |
| A1Cresult | 2.226939e-04 |
| admission_type_id | 1.375703e-04 |
| glipizide | 6.616885e-05 |
| race | 6.380132e-05 |
| repaglinide | 5.370252e-05 |
| pioglitazone | 5.324613e-05 |
| glimepiride | 3.597550e-05 |
| miglitol | 3.188722e-05 |
| rosiglitazone | 2.577645e-05 |
| chlorpropamide | 1.407742e-05 |
| glyburide | 1.399542e-05 |
| glyburide-metformin | 1.322640e-05 |
| acarbose | 1.030866e-05 |
| nateglinide | 9.086540e-06 |
| troglitazone | 2.822069e-06 |
| gender | 2.384694e-06 |
| tolbutamide | 2.138270e-06 |
| glimepiride-pioglitazone | 1.411026e-06 |
| metformin-pioglitazone | 1.411026e-06 |
| acetohexamide | 1.411026e-06 |
| glipizide-metformin | 1.074420e-06 |
| tolazamide | 5.743538e-08 |
| metformin-rosiglitazone | 0.000000e+00 |
| citoglipton | 0.000000e+00 |
| examide | 0.000000e+00 |
diabetic[numerical].corr()
| age | time_in_hospital | num_lab_procedures | num_procedures | num_medications | number_outpatient | number_emergency | number_inpatient | number_diagnoses | |
|---|---|---|---|---|---|---|---|---|---|
| age | 1.000000 | 0.107300 | 0.031678 | -0.022395 | 0.047998 | 0.020251 | -0.088964 | -0.041148 | 0.254452 |
| time_in_hospital | 0.107300 | 1.000000 | 0.326188 | 0.188776 | 0.469876 | -0.000524 | -0.006881 | 0.069348 | 0.231017 |
| num_lab_procedures | 0.031678 | 0.326188 | 1.000000 | 0.035141 | 0.280833 | 0.021724 | 0.010634 | 0.042225 | 0.158492 |
| num_procedures | -0.022395 | 0.188776 | 0.035141 | 1.000000 | 0.375691 | -0.019010 | -0.037061 | -0.067005 | 0.069733 |
| num_medications | 0.047998 | 0.469876 | 0.280833 | 0.375691 | 1.000000 | 0.046070 | 0.014358 | 0.068944 | 0.263503 |
| number_outpatient | 0.020251 | -0.000524 | 0.021724 | -0.019010 | 0.046070 | 1.000000 | 0.093707 | 0.117058 | 0.093438 |
| number_emergency | -0.088964 | -0.006881 | 0.010634 | -0.037061 | 0.014358 | 0.093707 | 1.000000 | 0.270503 | 0.049938 |
| number_inpatient | -0.041148 | 0.069348 | 0.042225 | -0.067005 | 0.068944 | 0.117058 | 0.270503 | 1.000000 | 0.105193 |
| number_diagnoses | 0.254452 | 0.231017 | 0.158492 | 0.069733 | 0.263503 | 0.093438 | 0.049938 | 0.105193 | 1.000000 |
There are barely any correlation between numerical features.
diabetic.groupby('readmitted')[numerical].mean()
| age | time_in_hospital | num_lab_procedures | num_procedures | num_medications | number_outpatient | number_emergency | number_inpatient | number_diagnoses | |
|---|---|---|---|---|---|---|---|---|---|
| readmitted | |||||||||
| 0 | 6.073505 | 4.336842 | 43.514067 | 1.347914 | 15.884246 | 0.347114 | 0.187337 | 0.576847 | 7.483785 |
| 1 | 6.171094 | 4.745414 | 44.779675 | 1.278410 | 16.895425 | 0.429370 | 0.375922 | 1.249315 | 7.793169 |
diabetic[['age', 'time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications',
'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses', 'readmitted']].corr()
| age | time_in_hospital | num_lab_procedures | num_procedures | num_medications | number_outpatient | number_emergency | number_inpatient | number_diagnoses | readmitted | |
|---|---|---|---|---|---|---|---|---|---|---|
| age | 1.000000 | 0.107300 | 0.031678 | -0.022395 | 0.047998 | 0.020251 | -0.088964 | -0.041148 | 0.254452 | 0.019202 |
| time_in_hospital | 0.107300 | 1.000000 | 0.326188 | 0.188776 | 0.469876 | -0.000524 | -0.006881 | 0.069348 | 0.231017 | 0.043338 |
| num_lab_procedures | 0.031678 | 0.326188 | 1.000000 | 0.035141 | 0.280833 | 0.021724 | 0.010634 | 0.042225 | 0.158492 | 0.020611 |
| num_procedures | -0.022395 | 0.188776 | 0.035141 | 1.000000 | 0.375691 | -0.019010 | -0.037061 | -0.067005 | 0.069733 | -0.012842 |
| num_medications | 0.047998 | 0.469876 | 0.280833 | 0.375691 | 1.000000 | 0.046070 | 0.014358 | 0.068944 | 0.263503 | 0.039314 |
| number_outpatient | 0.020251 | -0.000524 | 0.021724 | -0.019010 | 0.046070 | 1.000000 | 0.093707 | 0.117058 | 0.093438 | 0.020657 |
| number_emergency | -0.088964 | -0.006881 | 0.010634 | -0.037061 | 0.014358 | 0.093707 | 1.000000 | 0.270503 | 0.049938 | 0.060681 |
| number_inpatient | -0.041148 | 0.069348 | 0.042225 | -0.067005 | 0.068944 | 0.117058 | 0.270503 | 1.000000 | 0.105193 | 0.165195 |
| number_diagnoses | 0.254452 | 0.231017 | 0.158492 | 0.069733 | 0.263503 | 0.093438 | 0.049938 | 0.105193 | 1.000000 | 0.051082 |
| readmitted | 0.019202 | 0.043338 | 0.020611 | -0.012842 | 0.039314 | 0.020657 | 0.060681 | 0.165195 | 0.051082 | 1.000000 |
fig = px.scatter(diabetic, x='age', y='number_diagnoses', title='Age & # of Diagnoses ',color='readmitted', hover_data = diabetic[['readmitted']])
fig.show()
fig = px.scatter(diabetic, x='num_lab_procedures', y='num_medications', title='# Lab Procedures and Medications',color='readmitted', hover_data = diabetic[['readmitted']])
fig.show()
diabetic = pd.get_dummies(diabetic, drop_first = False)
train_df, valid_df, test_df = np.split(diabetic.sample(frac=1, random_state=42),
[int(.7*len(diabetic)), int(0.85*len(diabetic))])
train_df = train_df.reset_index(drop = True)
valid_df = valid_df.reset_index(drop = True)
test_df = test_df.reset_index(drop = True)
diabetic.readmitted.value_counts()
readmitted 0 74961 1 9486 Name: count, dtype: int64
train_df.readmitted.value_counts()
readmitted 0 52499 1 6613 Name: count, dtype: int64
valid_df.readmitted.value_counts()
readmitted 0 11230 1 1437 Name: count, dtype: int64
test_df.readmitted.value_counts()
readmitted 0 11232 1 1436 Name: count, dtype: int64
Imbalance in the data means that one of the classes in the data is too less as compared to the others. Typically, it is better to balance the data in some way to give the positives more weight. There are 3 strategies that are typically utilized:
Sub-sample the more dominant class: use a random subset of the negatives Over-sample the imbalanced class: use the same positive samples multiple times Create synthetic positive data Usually, you will want to use the latter two methods if you only have a handful of positive cases. Since we have a few thousand positive cases, let's use the sub-sample approach. Here, we will create a balanced training data set that has 50% positive and 50% negative. You can also play with this ratio to see if you can get an improvement.
def calc_prevalence(y_actual):
'''
This function is to understand the ratio/distribution of the classes that we are going to predict for.
Params:
1. y_actual: The target feature
Return:
1. (sum(y_actual)/len(y_actual)): The ratio of the postive class in the comlpete data.
'''
return (sum(y_actual)/len(y_actual))
# split the training data into positive and negative
rows_pos = train_df.readmitted == 1
df_train_pos = train_df.loc[rows_pos]
df_train_neg = train_df.loc[~rows_pos]
# merge the balanced data
sample_size = min(len(df_train_pos), len(df_train_neg))
diabetic_df_balanced = pd.concat([df_train_pos, df_train_neg.sample(n=sample_size, random_state=111)], axis=0)
# shuffle the order of training samples
diabetic_df_balanced = diabetic_df_balanced.sample(n=len(diabetic_df_balanced), random_state=42).reset_index(drop=True)
print('Train balanced prevalence(n=%d):%.3f'%(len(diabetic_df_balanced), \
calc_prevalence(diabetic_df_balanced.readmitted.values)))
Train balanced prevalence(n=13226):0.500
diabetic_df_balanced.readmitted.value_counts()
readmitted 1 6613 0 6613 Name: count, dtype: int64
X_train = diabetic_df_balanced.drop('readmitted',axis=1)
y_train = diabetic_df_balanced['readmitted']
X_valid = valid_df.drop('readmitted',axis=1)
y_valid = valid_df['readmitted']
X_test = test_df.drop('readmitted',axis=1)
y_test = test_df['readmitted']
scaler=StandardScaler()
X_train[['race', 'gender', 'age', 'admission_type_id', 'time_in_hospital', 'num_lab_procedures', 'A1Cresult', 'diabetesMed', 'number_inpatient']] = pd.DataFrame(scaler.fit_transform(X_train[['race', 'gender', 'age', 'admission_type_id', 'time_in_hospital', 'num_lab_procedures', 'A1Cresult', 'diabetesMed', 'number_inpatient']]),columns=['race', 'gender', 'age', 'admission_type_id', 'time_in_hospital', 'num_lab_procedures', 'A1Cresult', 'diabetesMed', 'number_inpatient'])
X_valid[['race', 'gender', 'age', 'admission_type_id', 'time_in_hospital', 'num_lab_procedures', 'A1Cresult', 'diabetesMed', 'number_inpatient']] = pd.DataFrame(scaler.transform(X_valid[['race', 'gender', 'age', 'admission_type_id', 'time_in_hospital', 'num_lab_procedures', 'A1Cresult', 'diabetesMed', 'number_inpatient']]),columns=['race', 'gender', 'age', 'admission_type_id', 'time_in_hospital', 'num_lab_procedures', 'A1Cresult', 'diabetesMed', 'number_inpatient'])
X_test[['race', 'gender', 'age', 'admission_type_id', 'time_in_hospital', 'num_lab_procedures', 'A1Cresult', 'diabetesMed', 'number_inpatient']] = pd.DataFrame(scaler.transform(X_test[['race', 'gender', 'age', 'admission_type_id', 'time_in_hospital', 'num_lab_procedures', 'A1Cresult', 'diabetesMed', 'number_inpatient']]),columns=['race', 'gender', 'age', 'admission_type_id', 'time_in_hospital', 'num_lab_procedures', 'A1Cresult', 'diabetesMed', 'number_inpatient'])
def calc_specificity(y_actual, y_pred, thresh):
# calculates specificity
return sum((y_pred < thresh) & (y_actual == 0)) /sum(y_actual ==0)
def print_report(y_actual, y_pred, thresh = 0.5):
'''
This function calculates all the metrics to asses the machine learning models.
Params:
1. y_actual: The actual values for the target variable.
2. y_pred: The predicted values for the target variable.
3. thresh: The threshold for the probability to be considered as a positive class. Default value 0.5
Return:
1. AUC
2. Accuracy
3. Recall
4. Precision
5. Specificity
'''
auc = roc_auc_score(y_actual, y_pred)
accuracy = accuracy_score(y_actual, (y_pred > thresh))
recall = recall_score(y_actual, (y_pred > thresh))
precision = precision_score(y_actual, (y_pred > thresh))
specificity = calc_specificity(y_actual, y_pred, thresh)
print('AUC:%.3f'%auc)
print('accuracy:%.3f'%accuracy)
print('recall:%.3f'%recall)
print('precision:%.3f'%precision)
print('specificity:%.3f'%specificity)
print('prevalence:%.3f'%calc_prevalence(y_actual))
print(' ')
return auc, accuracy, recall, precision, specificity
lnr = LinearRegression()
lnr.fit(X_train, y_train)
y_valid_preds = lnr.predict(X_valid)
y_valid_preds
array([0.69345324, 0.41855594, 0.76909945, ..., 0.49366581, 0.49793699,
0.431336 ])
diabetic['readmitted'].value_counts()
readmitted 0 74961 1 9486 Name: count, dtype: int64
lr=LogisticRegression(random_state = 42, solver = 'newton-cg', max_iter = 200)
lr.fit(X_train, y_train)
y_valid_preds = lr.predict_proba(X_valid)[:,1]
print('Metrics for Validation data:')
lr_valid_auc, lr_valid_accuracy, lr_valid_recall, \
lr_valid_precision, lr_valid_specificity = print_report(y_valid,y_valid_preds, 0.5)
Metrics for Validation data: AUC:0.644 accuracy:0.662 recall:0.515 precision:0.171 specificity:0.680 prevalence:0.113
importance = abs(lr.coef_[0])
indices = np.argsort(importance)[::-1][:5]
features = X_train.columns
print('Top 5 important variables:')
for i in indices:
print(features[i], importance[i])
Top 5 important variables: tolbutamide 0.5691542362986044 tolazamide 0.5490985038584199 number_inpatient 0.4555662342614939 miglitol 0.3715310590819527 chlorpropamide 0.1787726167684798
knn = KNeighborsClassifier(n_neighbors = 100)
knn.fit(X_train, y_train)
knn_preds = knn.predict_proba(X_valid)[:,1]
lr_valid_auc, lr_valid_accuracy, lr_valid_recall, \
lr_valid_precision, lr_valid_specificity = print_report(y_valid,knn_preds, 0.5)
AUC:0.492 accuracy:0.508 recall:0.473 precision:0.110 specificity:0.440 prevalence:0.113
# Corrected the loss parameter to 'log_loss'
sgdc = SGDClassifier(loss='log_loss', alpha=0.1, random_state=42)
sgdc.fit(X_train, y_train)
sgd_preds = sgdc.predict_proba(X_valid)[:, 1]
print('Stochastic Gradient Descent')
print('Validation:')
sgdc_valid_auc, sgdc_valid_accuracy, sgdc_valid_recall, \
sgdc_valid_precision, sgdc_valid_specificity = print_report(y_valid,sgd_preds, 0.5)
Stochastic Gradient Descent Validation: AUC:0.499 accuracy:0.885 recall:0.003 precision:0.121 specificity:0.997 prevalence:0.113
dc_clf = DecisionTreeClassifier(random_state=42, max_depth = 10)
dc_clf.fit(X_train, y_train)
dc_preds_proba = dc_clf.predict_proba(X_valid)[:,1]
dc_preds = dc_clf.predict(X_valid)
lr_valid_auc, lr_valid_accuracy, lr_valid_recall, \
lr_valid_precision, lr_valid_specificity = print_report(y_valid,dc_preds_proba, 0.5)
AUC:0.621 accuracy:0.576 recall:0.617 precision:0.156 specificity:0.556 prevalence:0.113
importance = dc_clf.feature_importances_
indices = np.argsort(importance)[::-1][:5]
features = X_train.columns
print('Top 5 important variables:')
for i in indices:
print(features[i], importance[i])
Top 5 important variables: number_inpatient 0.2048007112320039 discharge_disposition_id 0.17745895981290255 Unnamed: 0 0.1157032083947385 num_medications 0.06891705668441435 num_lab_procedures 0.06413676475721254
rf_clf = RandomForestClassifier(random_state=111, max_depth = 6)
rf_clf.fit(X_train, y_train)
rf_preds = rf_clf.predict(X_valid)
rf_preds_proba = rf_clf.predict_proba(X_valid)[:, 1]
lr_valid_auc, lr_valid_accuracy, lr_valid_recall, \
lr_valid_precision, lr_valid_specificity = print_report(y_valid,rf_preds_proba, 0.5)
AUC:0.664 accuracy:0.599 recall:0.644 precision:0.168 specificity:0.593 prevalence:0.113
rf_clf = RandomForestClassifier(random_state=111, max_depth = 6)
rf_clf.fit(X_train, y_train)
importance = rf_clf.feature_importances_
indices = np.argsort(importance)[::-1][:5]
features = X_train.columns
print('Top 5 important variables:')
for i in indices:
print(features[i], importance[i])
Top 5 important variables: number_inpatient 0.3027234895031829 discharge_disposition_id 0.16583873959191758 number_emergency 0.06512290444808795 num_medications 0.05209133801434666 number_diagnoses 0.049957598848719365
lsvc_clf = LinearSVC(random_state=111)
lsvc_clf.fit(X_train, y_train)
lsvc_preds = lsvc_clf.decision_function(X_valid)
lr_valid_auc, lr_valid_accuracy, lr_valid_recall, \
lr_valid_precision, lr_valid_specificity = print_report(y_valid,lsvc_preds, 0.5)
AUC:0.644 accuracy:0.878 recall:0.073 precision:0.324 specificity:0.980 prevalence:0.113
importance = np.abs(lsvc_clf.coef_[0])
indices = np.argsort(importance)[::-1][:5]
features = X_train.columns
print('Top 5 important variables:')
for i in indices:
print(features[i], importance[i])
Top 5 important variables: number_inpatient 0.19860000502746336 diabetesMed 0.05867793199326875 admission_type_id 0.052252363330388296 metformin 0.048384335855877904 glimepiride 0.04765938820341512
gb_clf = GradientBoostingClassifier(n_estimators = 100, criterion='friedman_mse', learning_rate = 1.0, max_depth = 3,\
random_state = 111)
gb_clf.fit(X_train, y_train)
gb_preds = gb_clf.predict(X_valid)
gb_preds_proba = gb_clf.predict_proba(X_valid)[:, 1]
lr_valid_auc, lr_valid_accuracy, lr_valid_recall, \
lr_valid_precision, lr_valid_specificity = print_report(y_valid,gb_preds_proba, 0.5)
AUC:0.625 accuracy:0.600 recall:0.571 precision:0.156 specificity:0.604 prevalence:0.113
importance = gb_clf.feature_importances_
indices = np.argsort(importance)[::-1][:5]
features = X_train.columns
print('Top 5 important variables:')
for i in indices:
print(features[i], importance[i])
Top 5 important variables: number_inpatient 0.20424845705452066 discharge_disposition_id 0.18498586187035168 Unnamed: 0 0.1723816558576928 num_medications 0.06102020776719027 num_lab_procedures 0.05517155454174269
xgb_clf = xgb.XGBClassifier(max_depth=3, learning_rate = 1.0, use_label_encoder = False,\
eval_metric = 'logloss')
xgb_clf.fit(X_train, y_train)
xgb_preds = xgb_clf.predict(X_valid)
xgb_preds_proba = xgb_clf.predict_proba(X_valid)[:, 1]
lr_valid_auc, lr_valid_accuracy, lr_valid_recall, \
lr_valid_precision, lr_valid_specificity = print_report(y_valid,xgb_preds_proba, 0.5)
AUC:0.643 accuracy:0.608 recall:0.598 precision:0.164 specificity:0.609 prevalence:0.113
importance = xgb_clf.feature_importances_
indices = np.argsort(importance)[::-1][:5]
features = X_train.columns
print('Top 5 important variables:')
for i in indices:
print(f"{features[i]}: {importance[i]}")
Top 5 important variables: number_inpatient: 0.16212581098079681 discharge_disposition_id: 0.10900657624006271 admission_source_id: 0.042793795466423035 change: 0.034398604184389114 number_emergency: 0.0335787869989872
catb=CatBoostClassifier(iterations=200, depth=3, learning_rate=1.0, random_state = 111)
catb.fit(X_train, y_train)
catb_preds = catb.predict_proba(X_valid)[:, 1]
lr_valid_auc, lr_valid_accuracy, lr_valid_recall, \
lr_valid_precision, lr_valid_specificity = print_report(y_valid,catb_preds, 0.5)
0: learn: 0.6648965 total: 162ms remaining: 32.2s 1: learn: 0.6600847 total: 168ms remaining: 16.6s 2: learn: 0.6526058 total: 173ms remaining: 11.4s 3: learn: 0.6509617 total: 178ms remaining: 8.73s 4: learn: 0.6483122 total: 184ms remaining: 7.16s 5: learn: 0.6457996 total: 189ms remaining: 6.12s 6: learn: 0.6449968 total: 194ms remaining: 5.35s 7: learn: 0.6437398 total: 200ms remaining: 4.79s 8: learn: 0.6425606 total: 205ms remaining: 4.36s 9: learn: 0.6416257 total: 211ms remaining: 4.01s 10: learn: 0.6407779 total: 217ms remaining: 3.73s 11: learn: 0.6395777 total: 223ms remaining: 3.49s 12: learn: 0.6381783 total: 228ms remaining: 3.28s 13: learn: 0.6373445 total: 234ms remaining: 3.11s 14: learn: 0.6362202 total: 239ms remaining: 2.95s 15: learn: 0.6351042 total: 245ms remaining: 2.82s 16: learn: 0.6342171 total: 251ms remaining: 2.7s 17: learn: 0.6333739 total: 257ms remaining: 2.6s 18: learn: 0.6323785 total: 262ms remaining: 2.5s 19: learn: 0.6316892 total: 267ms remaining: 2.41s 20: learn: 0.6308403 total: 272ms remaining: 2.32s 21: learn: 0.6304525 total: 278ms remaining: 2.25s 22: learn: 0.6297335 total: 283ms remaining: 2.18s 23: learn: 0.6288386 total: 289ms remaining: 2.12s 24: learn: 0.6279195 total: 295ms remaining: 2.06s 25: learn: 0.6253405 total: 300ms remaining: 2.01s 26: learn: 0.6250961 total: 306ms remaining: 1.96s 27: learn: 0.6244931 total: 312ms remaining: 1.92s 28: learn: 0.6238170 total: 317ms remaining: 1.87s 29: learn: 0.6233927 total: 324ms remaining: 1.83s 30: learn: 0.6225386 total: 330ms remaining: 1.8s 31: learn: 0.6217847 total: 335ms remaining: 1.76s 32: learn: 0.6213437 total: 341ms remaining: 1.73s 33: learn: 0.6203912 total: 346ms remaining: 1.69s 34: learn: 0.6196070 total: 352ms remaining: 1.66s 35: learn: 0.6190562 total: 357ms remaining: 1.63s 36: learn: 0.6180958 total: 363ms remaining: 1.6s 37: learn: 0.6174709 total: 369ms remaining: 1.57s 38: learn: 0.6167741 total: 374ms remaining: 1.54s 39: learn: 0.6162518 total: 380ms remaining: 1.52s 40: learn: 0.6156439 total: 385ms remaining: 1.49s 41: learn: 0.6146355 total: 391ms remaining: 1.47s 42: learn: 0.6143936 total: 396ms remaining: 1.45s 43: learn: 0.6138505 total: 402ms remaining: 1.43s 44: learn: 0.6137138 total: 408ms remaining: 1.4s 45: learn: 0.6129805 total: 413ms remaining: 1.38s 46: learn: 0.6120866 total: 419ms remaining: 1.36s 47: learn: 0.6117924 total: 425ms remaining: 1.34s 48: learn: 0.6108493 total: 431ms remaining: 1.33s 49: learn: 0.6105826 total: 437ms remaining: 1.31s 50: learn: 0.6100997 total: 443ms remaining: 1.29s 51: learn: 0.6091630 total: 448ms remaining: 1.27s 52: learn: 0.6084778 total: 453ms remaining: 1.26s 53: learn: 0.6078771 total: 459ms remaining: 1.24s 54: learn: 0.6071188 total: 465ms remaining: 1.23s 55: learn: 0.6063470 total: 470ms remaining: 1.21s 56: learn: 0.6057888 total: 475ms remaining: 1.19s 57: learn: 0.6057100 total: 481ms remaining: 1.18s 58: learn: 0.6052223 total: 487ms remaining: 1.16s 59: learn: 0.6047211 total: 493ms remaining: 1.15s 60: learn: 0.6041776 total: 498ms remaining: 1.13s 61: learn: 0.6038092 total: 504ms remaining: 1.12s 62: learn: 0.6033687 total: 509ms remaining: 1.11s 63: learn: 0.6028513 total: 514ms remaining: 1.09s 64: learn: 0.6020972 total: 520ms remaining: 1.08s 65: learn: 0.6016110 total: 525ms remaining: 1.07s 66: learn: 0.6013313 total: 532ms remaining: 1.05s 67: learn: 0.6006054 total: 539ms remaining: 1.04s 68: learn: 0.6004157 total: 545ms remaining: 1.03s 69: learn: 0.5998223 total: 551ms remaining: 1.02s 70: learn: 0.5997358 total: 556ms remaining: 1.01s 71: learn: 0.5992190 total: 562ms remaining: 998ms 72: learn: 0.5986042 total: 567ms remaining: 986ms 73: learn: 0.5980407 total: 572ms remaining: 975ms 74: learn: 0.5973969 total: 578ms remaining: 963ms 75: learn: 0.5970157 total: 585ms remaining: 954ms 76: learn: 0.5967069 total: 590ms remaining: 943ms 77: learn: 0.5959242 total: 597ms remaining: 934ms 78: learn: 0.5953736 total: 603ms remaining: 924ms 79: learn: 0.5948974 total: 609ms remaining: 913ms 80: learn: 0.5942228 total: 615ms remaining: 903ms 81: learn: 0.5935498 total: 620ms remaining: 893ms 82: learn: 0.5929510 total: 626ms remaining: 883ms 83: learn: 0.5926026 total: 632ms remaining: 873ms 84: learn: 0.5919484 total: 638ms remaining: 863ms 85: learn: 0.5912187 total: 644ms remaining: 853ms 86: learn: 0.5905671 total: 648ms remaining: 842ms 87: learn: 0.5897125 total: 653ms remaining: 831ms 88: learn: 0.5889698 total: 658ms remaining: 821ms 89: learn: 0.5884948 total: 664ms remaining: 812ms 90: learn: 0.5878020 total: 669ms remaining: 802ms 91: learn: 0.5873910 total: 674ms remaining: 792ms 92: learn: 0.5868850 total: 680ms remaining: 782ms 93: learn: 0.5863924 total: 685ms remaining: 773ms 94: learn: 0.5858218 total: 691ms remaining: 764ms 95: learn: 0.5852027 total: 697ms remaining: 756ms 96: learn: 0.5845692 total: 703ms remaining: 747ms 97: learn: 0.5838478 total: 709ms remaining: 738ms 98: learn: 0.5832592 total: 715ms remaining: 729ms 99: learn: 0.5827199 total: 720ms remaining: 720ms 100: learn: 0.5820375 total: 726ms remaining: 712ms 101: learn: 0.5819110 total: 732ms remaining: 703ms 102: learn: 0.5817337 total: 737ms remaining: 694ms 103: learn: 0.5815150 total: 743ms remaining: 686ms 104: learn: 0.5808800 total: 749ms remaining: 678ms 105: learn: 0.5805232 total: 755ms remaining: 669ms 106: learn: 0.5801613 total: 761ms remaining: 661ms 107: learn: 0.5797089 total: 766ms remaining: 653ms 108: learn: 0.5790927 total: 771ms remaining: 644ms 109: learn: 0.5784704 total: 777ms remaining: 636ms 110: learn: 0.5782979 total: 783ms remaining: 628ms 111: learn: 0.5777635 total: 789ms remaining: 620ms 112: learn: 0.5774884 total: 795ms remaining: 612ms 113: learn: 0.5768454 total: 800ms remaining: 604ms 114: learn: 0.5763961 total: 806ms remaining: 596ms 115: learn: 0.5755184 total: 811ms remaining: 587ms 116: learn: 0.5751418 total: 817ms remaining: 579ms 117: learn: 0.5746229 total: 823ms remaining: 572ms 118: learn: 0.5741923 total: 828ms remaining: 564ms 119: learn: 0.5741659 total: 834ms remaining: 556ms 120: learn: 0.5738050 total: 840ms remaining: 548ms 121: learn: 0.5732728 total: 845ms remaining: 540ms 122: learn: 0.5730028 total: 851ms remaining: 533ms 123: learn: 0.5725906 total: 856ms remaining: 525ms 124: learn: 0.5723736 total: 862ms remaining: 517ms 125: learn: 0.5719188 total: 866ms remaining: 509ms 126: learn: 0.5714192 total: 871ms remaining: 501ms 127: learn: 0.5710013 total: 877ms remaining: 493ms 128: learn: 0.5705755 total: 883ms remaining: 486ms 129: learn: 0.5700396 total: 887ms remaining: 478ms 130: learn: 0.5695674 total: 892ms remaining: 470ms 131: learn: 0.5689367 total: 897ms remaining: 462ms 132: learn: 0.5682862 total: 903ms remaining: 455ms 133: learn: 0.5677623 total: 908ms remaining: 447ms 134: learn: 0.5676602 total: 914ms remaining: 440ms 135: learn: 0.5671312 total: 920ms remaining: 433ms 136: learn: 0.5665800 total: 925ms remaining: 426ms 137: learn: 0.5660965 total: 931ms remaining: 418ms 138: learn: 0.5657441 total: 936ms remaining: 411ms 139: learn: 0.5652879 total: 942ms remaining: 404ms 140: learn: 0.5648533 total: 948ms remaining: 397ms 141: learn: 0.5644145 total: 953ms remaining: 389ms 142: learn: 0.5642917 total: 959ms remaining: 382ms 143: learn: 0.5637970 total: 963ms remaining: 375ms 144: learn: 0.5632082 total: 969ms remaining: 368ms 145: learn: 0.5626776 total: 975ms remaining: 361ms 146: learn: 0.5620795 total: 981ms remaining: 354ms 147: learn: 0.5615993 total: 986ms remaining: 346ms 148: learn: 0.5610694 total: 991ms remaining: 339ms 149: learn: 0.5607280 total: 997ms remaining: 332ms 150: learn: 0.5605829 total: 1s remaining: 325ms 151: learn: 0.5601025 total: 1.01s remaining: 319ms 152: learn: 0.5596164 total: 1.01s remaining: 312ms 153: learn: 0.5591669 total: 1.02s remaining: 305ms 154: learn: 0.5586668 total: 1.02s remaining: 298ms 155: learn: 0.5586449 total: 1.03s remaining: 291ms 156: learn: 0.5584920 total: 1.04s remaining: 284ms 157: learn: 0.5581205 total: 1.04s remaining: 277ms 158: learn: 0.5576830 total: 1.05s remaining: 270ms 159: learn: 0.5571034 total: 1.05s remaining: 263ms 160: learn: 0.5567958 total: 1.06s remaining: 256ms 161: learn: 0.5563886 total: 1.06s remaining: 250ms 162: learn: 0.5558839 total: 1.07s remaining: 243ms 163: learn: 0.5555623 total: 1.07s remaining: 236ms 164: learn: 0.5549317 total: 1.08s remaining: 229ms 165: learn: 0.5543260 total: 1.08s remaining: 222ms 166: learn: 0.5536682 total: 1.09s remaining: 216ms 167: learn: 0.5529090 total: 1.09s remaining: 209ms 168: learn: 0.5524482 total: 1.1s remaining: 202ms 169: learn: 0.5517712 total: 1.11s remaining: 195ms 170: learn: 0.5514550 total: 1.11s remaining: 189ms 171: learn: 0.5509897 total: 1.12s remaining: 182ms 172: learn: 0.5506147 total: 1.12s remaining: 175ms 173: learn: 0.5501039 total: 1.13s remaining: 169ms 174: learn: 0.5499846 total: 1.13s remaining: 162ms 175: learn: 0.5499359 total: 1.14s remaining: 155ms 176: learn: 0.5496269 total: 1.14s remaining: 149ms 177: learn: 0.5496103 total: 1.15s remaining: 142ms 178: learn: 0.5491435 total: 1.16s remaining: 136ms 179: learn: 0.5490699 total: 1.16s remaining: 129ms 180: learn: 0.5486602 total: 1.17s remaining: 123ms 181: learn: 0.5484101 total: 1.17s remaining: 116ms 182: learn: 0.5479443 total: 1.18s remaining: 110ms 183: learn: 0.5477453 total: 1.19s remaining: 103ms 184: learn: 0.5467985 total: 1.19s remaining: 96.6ms 185: learn: 0.5463339 total: 1.2s remaining: 90.1ms 186: learn: 0.5458361 total: 1.2s remaining: 83.7ms 187: learn: 0.5451832 total: 1.21s remaining: 77.2ms 188: learn: 0.5446521 total: 1.22s remaining: 70.7ms 189: learn: 0.5440583 total: 1.22s remaining: 64.4ms 190: learn: 0.5436080 total: 1.23s remaining: 57.9ms 191: learn: 0.5432104 total: 1.24s remaining: 51.5ms 192: learn: 0.5428181 total: 1.24s remaining: 45ms 193: learn: 0.5423089 total: 1.25s remaining: 38.6ms 194: learn: 0.5418352 total: 1.25s remaining: 32.2ms 195: learn: 0.5412436 total: 1.26s remaining: 25.7ms 196: learn: 0.5406283 total: 1.27s remaining: 19.3ms 197: learn: 0.5401505 total: 1.27s remaining: 12.9ms 198: learn: 0.5396560 total: 1.28s remaining: 6.42ms 199: learn: 0.5394535 total: 1.28s remaining: 0us AUC:0.634 accuracy:0.609 recall:0.583 precision:0.161 specificity:0.612 prevalence:0.113
recall_scoring = make_scorer(recall_score)
dc_grid = {'max_features':['auto','sqrt'], # maximum number of features to use at each split
'max_depth':range(1,11,1), # maximum depth of the tree
'min_samples_split':range(2,10,2), # minimum number of samples to split a node
'criterion':['gini','entropy']} # criterion for evaluating a split
dc_random = RandomizedSearchCV(estimator = dc_clf, param_distributions = dc_grid,
n_iter = 20, cv = 2, scoring=recall_scoring,
verbose = 1, random_state = 111)
dc_random.fit(X_train, y_train)
dc_random.best_params_
dc_hp_preds = dc_random.best_estimator_.predict(X_valid)
dc_hp_preds_proba = dc_random.best_estimator_.predict_proba(X_valid)[:,1]
roc_auc_score(y_valid, dc_hp_preds_proba)
Fitting 2 folds for each of 20 candidates, totalling 40 fits
0.5322862696909251
recall_score(y_valid, dc_hp_preds)
0.6798886569241476
xgb_grid = params = {
'min_child_weight': [1, 5, 8, 10],
'gamma': [0.5, 1, 1.5, 2, 5],
'subsample': [0.6, 0.8, 1.0],
'colsample_bytree': [0.6, 0.8, 0.9, 1.0],
'max_depth': [3, 4, 5]
} # criterion for evaluating a split
xgb_random = GridSearchCV(estimator = xgb_clf, param_grid = xgb_grid,
cv = 2, scoring = recall_scoring,
verbose = 1)
xgb_random.fit(X_train, y_train)
xgb_random.best_params_
xgb_hp_preds = xgb_random.best_estimator_.predict(X_valid)
xgb_hp_preds_proba = xgb_random.best_estimator_.predict_proba(X_valid)[:,1]
roc_auc_score(y_valid, xgb_hp_preds_proba)
Fitting 2 folds for each of 720 candidates, totalling 1440 fits
0.6553197798173325
recall_score(y_valid, xgb_hp_preds)
0.6339596381350034
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, roc_auc_score, accuracy_score, recall_score, precision_score, confusion_matrix
import numpy as np
# define the parameter grid to search over
param_grid = {
'n_estimators': [50, 100],
'max_depth': [6, 8],
'max_features': ['sqrt', 'log2']
}
# create a function to calculate all the evaluation metrics
def print_report(y_true, y_pred_prob, threshold):
y_pred = (y_pred_prob >= threshold).astype(int)
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
auc = roc_auc_score(y_true, y_pred_prob)
accuracy = accuracy_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
specificity = tn / (tn + fp)
prevalence = np.mean(y_true)
print("AUC:{:.3f}".format(auc))
print("accuracy:{:.3f}".format(accuracy))
print("recall:{:.3f}".format(recall))
print("precision:{:.3f}".format(precision))
print("specificity:{:.3f}".format(specificity))
print("prevalence:{:.3f}".format(prevalence))
# create the random forest classifier with default parameters
rf_clf = RandomForestClassifier(random_state=111)
# create a dictionary to store the evaluation metrics for each combination of hyperparameters
results = {}
# perform a grid search to find the best combination of hyperparameters
clf = GridSearchCV(rf_clf, param_grid, scoring=make_scorer(roc_auc_score), cv=3, n_jobs=-1)
clf.fit(X_train, y_train)
# print the best hyperparameters and their corresponding scores
print("Best hyperparameters: ", clf.best_params_)
best_clf = clf.best_estimator_
rf_preds_proba = best_clf.predict_proba(X_valid)[:, 1]
print_report(y_valid, rf_preds_proba, 0.5)
Best hyperparameters: {'max_depth': 8, 'max_features': 'sqrt', 'n_estimators': 100}
AUC:0.669
accuracy:0.619
recall:0.622
precision:0.173
specificity:0.619
prevalence:0.113
importances = best_clf.feature_importances_
indices = np.argsort(importances)[::-1]
for f in range(X_train.shape[1]):
print("%d. %s (%f)" % (f + 1, X_train.columns[indices[f]], importances[indices[f]]))
1. number_inpatient (0.241661) 2. discharge_disposition_id (0.130218) 3. Unnamed: 0 (0.073085) 4. num_medications (0.058445) 5. num_lab_procedures (0.056034) 6. number_emergency (0.054367) 7. number_diagnoses (0.047471) 8. age (0.043715) 9. diag_1 (0.043168) 10. time_in_hospital (0.041978) 11. number_outpatient (0.024163) 12. num_procedures (0.023491) 13. insulin (0.021783) 14. A1Cresult (0.014635) 15. metformin (0.013837) 16. race (0.013692) 17. admission_type_id (0.013684) 18. admission_source_id (0.012857) 19. diabetesMed (0.010361) 20. glipizide (0.009341) 21. change (0.008494) 22. glyburide (0.007365) 23. pioglitazone (0.007250) 24. gender (0.007002) 25. glimepiride (0.005691) 26. rosiglitazone (0.004892) 27. repaglinide (0.004155) 28. nateglinide (0.002187) 29. glyburide-metformin (0.002112) 30. max_glu_serum (0.001936) 31. acarbose (0.000724) 32. chlorpropamide (0.000091) 33. tolazamide (0.000049) 34. miglitol (0.000026) 35. tolbutamide (0.000023) 36. glipizide-metformin (0.000019) 37. examide (0.000000) 38. citoglipton (0.000000) 39. troglitazone (0.000000) 40. glimepiride-pioglitazone (0.000000) 41. metformin-rosiglitazone (0.000000) 42. metformin-pioglitazone (0.000000) 43. acetohexamide (0.000000)